##########################################
# Replication Data for Proksch, Lowe, Wäckerle, Soroka. (2018). Multilingual Sentiment Analysis: A New Approach to Measuring Conflict in Legislative Speeches. Legislative Studies Quarterly, Forthcoming.
##########################################

#Part 3: Wordscores Replication
#Most of this code follows the replication provided by Herzog and Benoit for the article "The Most Unkindest Cuts: Speaker Selection and Expressed Government Dissent During Economic Crisis"

# Start from an empty workspace so objects left over from earlier steps cannot
# leak into this part of the replication.
rm(list = ls(all.names = TRUE))
library(rstudioapi)

# Work relative to the directory of this script. The path is taken from the
# active document, so this step needs to be run from within RStudio.
current_path <- getActiveDocumentContext()$path
setwd(dirname(current_path))

load("3_positions_after_senti.RData")

# Pin quanteda to release 0.8.0-3 (presumably the version the original
# Herzog and Benoit replication code was written against -- TODO confirm).
# library() is used instead of require() so that a missing package stops the
# script here rather than failing later with a less obvious error.
library(devtools)

# detach() raises an error when the package is not attached, which is the
# normal situation in a fresh session, so only detach when it is on the
# search path.
if ("package:quanteda" %in% search()) {
  detach("package:quanteda", unload = TRUE)
}

# remove.packages() works on the first library in .libPaths() by default and
# errors when the package is not installed there, so check that library first.
if (length(find.package("quanteda", lib.loc = .libPaths()[1L], quiet = TRUE)) > 0) {
  remove.packages("quanteda")
}

install_version("quanteda", version = "0.8.0-3", repos = "http://cran.us.r-project.org")
library(quanteda)

# Attach the wordscores estimates to the speaker-year records. This is a full
# outer join on (memberID, budget_year): rows present in only one of the two
# tables are kept, with NA in the columns coming from the other table.
yearspeakerData <- merge(
  x = yearspeakerData,
  y = textResults,
  by = c("memberID", "budget_year"),
  all.x = TRUE,
  all.y = TRUE
)

# Inspect speaker-years that have no wordscore although the speaker is neither Leas Cheann Comhairle nor Cheann Comhairle:
# View(yearspeakerData[is.na(yearspeakerData$textscore) & !yearspeakerData$cheann_comhairle & !yearspeakerData$leas_cheann_comhairle,] & !yearspeakerData$budget_year==0,])

# (texts(budgetCorpus)[which(docvars(budgetCorpus, "memberID")==1109 & docvars(budgetCorpus, "budget_year")==1984)])

## Add in speech information:
##  nspeeches:  How many recorded speech acts for this speaker, per budget_year
##  nwordstotal:  How many total tokens this speaker contributed for this budget_year

# Number of speeches per (memberID, budget_year): count the speechID entries
# in each group.
nspeeches <- aggregate(docvars(budgetCorpus)$speechID,
                       by = list(memberID = docvars(budgetCorpus)$memberID,
                                 budget_year = docvars(budgetCorpus)$budget_year),
                       FUN = length)

# Token count of every speech in the corpus. vapply() guarantees an integer
# vector, whereas the return type of sapply() depends on its input.
ntokens <- vapply(texts(budgetCorpus),
                  function(x) length(tokenize(x, simplify = TRUE)),
                  integer(1))

# Total tokens per (memberID, budget_year). Despite the variable name, this
# sums token counts, not character counts.
ncharspeeches <- aggregate(ntokens,
                           by = list(memberID = docvars(budgetCorpus)$memberID,
                                     budget_year = docvars(budgetCorpus)$budget_year),
                           FUN = sum)

# Combine the two summaries by their keys rather than binding columns by
# position, so the result does not depend on both aggregates happening to
# return their groups in the same row order.
temptextdata <- merge(
  setNames(nspeeches, c("memberID", "budget_year", "nspeeches")),
  setNames(ncharspeeches, c("memberID", "budget_year", "nwordstotal")),
  by = c("memberID", "budget_year")
)

# merge in the speech data (full outer join, so speaker-years without any
# speech keep their row with NA counts)
yearspeakerData <- merge(yearspeakerData,
                         temptextdata,
                         by = c("memberID", "budget_year"),
                         all = TRUE)

## Fold the leas_cheann_comhairle and cheann_comhairle flags into the position
## columns. Within each column the Leas Cheann Comhairle value is written
## first, so a row flagged as both ends up with the Cheann Comhairle value.
yearspeakerData$position <- replace(yearspeakerData$position,
                                    yearspeakerData$leas_cheann_comhairle,
                                    "Leas Cheann Comhairle")
yearspeakerData$position <- replace(yearspeakerData$position,
                                    yearspeakerData$cheann_comhairle,
                                    "Cheann Comhairle")
yearspeakerData$position_numeric <- replace(yearspeakerData$position_numeric,
                                            yearspeakerData$leas_cheann_comhairle,
                                            4)
yearspeakerData$position_numeric <- replace(yearspeakerData$position_numeric,
                                            yearspeakerData$cheann_comhairle,
                                            5)

# Drop budget_year == 0, which takes out the 2009 supplementary budget that we
# ignore. Rows whose budget_year is missing are dropped as well.
yearspeakerData <- yearspeakerData[
  !is.na(yearspeakerData$budget_year) & yearspeakerData$budget_year != 0,
]

save(list = "yearspeakerData", file = "3_yearspeakerData.RData")


## Summary statistics on the number of distinct speakers (unique memberID
## values in textResults) per budget year
summary(
  aggregate(
    x = textResults$memberID,
    by = list(textResults$budget_year),
    FUN = function(ids) length(unique(ids))
  )
)

